In [89]:
import pandas as pd

commits = pd.read_csv("../../intellij-community/git_oneline.log")    
commits.head()


Out[89]:
sha timestamp author
0 a0fc4d3828f 1502119397 peter
1 dec0d165974 1502122966 Anna.Kozlova
2 f1fd8e4bc7a 1502122584 Egor.Ushakov
3 f965b772530 1502121596 Alexey Kudravtsev
4 62759679eb4 1502121252 Alexey Kudravtsev

In [92]:
commits_raw.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 228801 entries, 0 to 228800
Data columns (total 4 columns):
sha          228801 non-null object
date         228801 non-null object
timestamp    228801 non-null int64
author       228801 non-null object
dtypes: int64(1), object(3)
memory usage: 7.0+ MB

In [93]:
commits['author'].value_counts().head(10).plot(kind='pie', figsize=(5,5))


Out[93]:
<matplotlib.axes._subplots.AxesSubplot at 0x252e3207128>

In [94]:
commits.head()


Out[94]:
sha timestamp author
0 a0fc4d3828f 1502119397 peter
1 dec0d165974 1502122966 Anna.Kozlova
2 f1fd8e4bc7a 1502122584 Egor.Ushakov
3 f965b772530 1502121596 Alexey Kudravtsev
4 62759679eb4 1502121252 Alexey Kudravtsev

In [95]:
commits = commits.set_index(pd.to_datetime(commits['timestamp'], unit="s"))
commits.head()


Out[95]:
sha timestamp author
timestamp
2017-08-07 15:23:17 a0fc4d3828f 1502119397 peter
2017-08-07 16:22:46 dec0d165974 1502122966 Anna.Kozlova
2017-08-07 16:16:24 f1fd8e4bc7a 1502122584 Egor.Ushakov
2017-08-07 15:59:56 f965b772530 1502121596 Alexey Kudravtsev
2017-08-07 15:54:12 62759679eb4 1502121252 Alexey Kudravtsev

In [96]:
commits_per_month['commits_cum'] = commits_per_month['author'].cumsum()
commits_per_month


Out[96]:
sha timestamp author time commits_cum
time
2004-11-30 18 18 18 18 18
2004-12-31 29 29 29 29 47
2005-01-31 380 380 380 380 427
2005-02-28 595 595 595 595 1022
2005-03-31 664 664 664 664 1686
2005-04-30 601 601 601 601 2287
2005-05-31 669 669 669 669 2956
2005-06-30 1046 1046 1046 1046 4002
2005-07-31 1181 1181 1181 1181 5183
2005-08-31 428 428 428 428 5611
2005-09-30 362 362 362 362 5973
2005-10-31 403 403 403 403 6376
2005-11-30 594 594 594 594 6970
2005-12-31 526 526 526 526 7496
2006-01-31 744 744 744 744 8240
2006-02-28 661 661 661 661 8901
2006-03-31 706 706 706 706 9607
2006-04-30 741 741 741 741 10348
2006-05-31 738 738 738 738 11086
2006-06-30 846 846 846 846 11932
2006-07-31 1161 1161 1161 1161 13093
2006-08-31 683 683 683 683 13776
2006-09-30 919 919 919 919 14695
2006-10-31 752 752 752 752 15447
2006-11-30 597 597 597 597 16044
2006-12-31 524 524 524 524 16568
2007-01-31 690 690 690 690 17258
2007-02-28 684 684 684 684 17942
2007-03-31 962 962 962 962 18904
2007-04-30 796 796 796 796 19700
... ... ... ... ... ...
2015-03-31 2467 2467 2467 2467 165752
2015-04-30 1943 1943 1943 1943 167695
2015-05-31 1737 1737 1737 1737 169432
2015-06-30 1680 1680 1680 1680 171112
2015-07-31 1965 1965 1965 1965 173077
2015-08-31 2012 2012 2012 2012 175089
2015-09-30 2360 2360 2360 2360 177449
2015-10-31 3252 3252 3252 3252 180701
2015-11-30 2206 2206 2206 2206 182907
2015-12-31 1901 1901 1901 1901 184808
2016-01-31 1207 1207 1207 1207 186015
2016-02-29 2076 2076 2076 2076 188091
2016-03-31 2604 2604 2604 2604 190695
2016-04-30 2352 2352 2352 2352 193047
2016-05-31 2044 2044 2044 2044 195091
2016-06-30 2371 2371 2371 2371 197462
2016-07-31 2114 2114 2114 2114 199576
2016-08-31 2240 2240 2240 2240 201816
2016-09-30 2177 2177 2177 2177 203993
2016-10-31 3130 3130 3130 3130 207123
2016-11-30 2487 2487 2487 2487 209610
2016-12-31 2033 2033 2033 2033 211643
2017-01-31 1981 1981 1981 1981 213624
2017-02-28 2437 2437 2437 2437 216061
2017-03-31 3001 3001 3001 3001 219062
2017-04-30 2578 2578 2578 2578 221640
2017-05-31 2424 2424 2424 2424 224064
2017-06-30 2365 2365 2365 2365 226429
2017-07-31 2040 2040 2040 2040 228469
2017-08-31 332 332 332 332 228801

154 rows × 5 columns


In [38]:
commits_per_month['commits_cum'].plot()


Out[38]:
<matplotlib.axes._subplots.AxesSubplot at 0x24f4ce88a58>

In [ ]:
commits.

In [77]:
%matplotlib inline
commits_raw['time'].plot()


---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-77-47f2e92a8e6f> in <module>()
      1 get_ipython().magic('matplotlib inline')
----> 2 commits_raw['time'].plot()

C:\dev\apps\Anaconda3\lib\site-packages\pandas\plotting\_core.py in __call__(self, kind, ax, figsize, use_index, title, grid, legend, style, logx, logy, loglog, xticks, yticks, xlim, ylim, rot, fontsize, colormap, table, yerr, xerr, label, secondary_y, **kwds)
   2444                            colormap=colormap, table=table, yerr=yerr,
   2445                            xerr=xerr, label=label, secondary_y=secondary_y,
-> 2446                            **kwds)
   2447     __call__.__doc__ = plot_series.__doc__
   2448 

C:\dev\apps\Anaconda3\lib\site-packages\pandas\plotting\_core.py in plot_series(data, kind, ax, figsize, use_index, title, grid, legend, style, logx, logy, loglog, xticks, yticks, xlim, ylim, rot, fontsize, colormap, table, yerr, xerr, label, secondary_y, **kwds)
   1886                  yerr=yerr, xerr=xerr,
   1887                  label=label, secondary_y=secondary_y,
-> 1888                  **kwds)
   1889 
   1890 

C:\dev\apps\Anaconda3\lib\site-packages\pandas\plotting\_core.py in _plot(data, x, y, subplots, ax, kind, **kwds)
   1680         plot_obj = klass(data, subplots=subplots, ax=ax, kind=kind, **kwds)
   1681 
-> 1682     plot_obj.generate()
   1683     plot_obj.draw()
   1684     return plot_obj.result

C:\dev\apps\Anaconda3\lib\site-packages\pandas\plotting\_core.py in generate(self)
    234     def generate(self):
    235         self._args_adjust()
--> 236         self._compute_plot_data()
    237         self._setup_subplots()
    238         self._make_plot()

C:\dev\apps\Anaconda3\lib\site-packages\pandas\plotting\_core.py in _compute_plot_data(self)
    343         if is_empty:
    344             raise TypeError('Empty {0!r}: no numeric data to '
--> 345                             'plot'.format(numeric_data.__class__.__name__))
    346 
    347         self.data = numeric_data

TypeError: Empty 'DataFrame': no numeric data to plot

In [73]:
commits_raw['unix'].dtypes


Out[73]:
dtype('int64')

In [38]:
commits_raw['time'] = pd.to_datetime(commits_raw['timestamp'])
commits_raw.dtypes


Out[38]:
sha                  object
date                 object
timestamp             int64
author               object
time         datetime64[ns]
dtype: object

In [ ]:
commits_raw.head()

In [ ]:
commits_raw

In [1]:
import git 

GIT_LOG_FILE = r'C:/dev/repos/intellij-community/'
repo = git.Repo(GIT_LOG_FILE, odbt=git.GitCmdObjectDB)
g = repo.git
log = g.log('--all', '--numstat', '--no-renames', '--pretty=format:#%h#%ad#%aN')
log[0:100]


Out[1]:
'#a0fc4d3828f#Mon Aug 7 17:23:17 2017 +0200#peter\n3\t9\tjava/java-analysis-impl/src/com/intellij/codeIn'

After this, we have to tell Git which information we want. We can do this via the pretty-format option.

For each commit, we choose to create a header line with the following commit info (by using --pretty=format:'--%h--%ad--%aN'), which gives us the following output:

--fa1ca6f--Thu Dec 22 08:04:18 2016 +0100--feststelltaste
It contains the SHA key, the timestamp as well as the author's name of the commit, separated by a character that isn't certaninly in these information--. My favorite separator for this is \u0012

We also want to have some details about the modifications of each file per commit. This is why we use the --numstat flag.

Together with the --all flag to get all commits and the --no-renames flag to avoid commits that only rename files, we retrieve all the needed information directly via Git.

For each other row, we got some statistics about the modified files:

2   0   src/main/asciidoc/appendices/bibliography.adoc

It contains the number of lines inserted, the number of lines deleted and the relative path of the file. With a little trick and a little bit of data wrangling, we can read that information into a nicely structured DataFrame.

The first entries of that file look something like this:

Let's get started!

Import the data

First, I'll show you my approach on how to read nearly everything into a DataFrame. The key is to use Pandas' read_csv for reading "non-character separated values". How to do that? We simply choose a separator that doesn't occur in the file that we want to read. My favorite character for this is the "DEVICE CONTROL TWO" character U+0012. I haven't encountered a situation yet where this character was included in a data set.

We just read our git.log file without any headers (because there are none) and give the only column a nice name.


In [3]:
import pandas as pd
from io import StringIO

commits_raw = pd.read_csv(StringIO(log), 
    sep="#",
    header=None,
    names=['file_stats','sha', 'date', 'author'])
commits_raw.head()


Out[3]:
file_stats sha date author
0 NaN a0fc4d3828f Mon Aug 7 17:23:17 2017 +0200 peter
1 3\t9\tjava/java-analysis-impl/src/com/intellij... NaN NaN NaN
2 17\t0\tjava/java-tests/testData/inspection/nul... NaN NaN NaN
3 18\t0\tjava/java-tests/testData/inspection/nul... NaN NaN NaN
4 6\t0\tjava/java-tests/testSrc/com/intellij/jav... NaN NaN NaN

Data Wrangling

OK, but now we have a problem data wrangling challenge. We have the commit info as well as the statistic for the modified file in one column, but they don't belong together. What we want is to have the commit info along with the file statistics in separate columns to get some serious analysis started.

Commit info

Let's treat the commit info first. Luckily, we set some kind of anchor or marker to identify the commit info: Each commit info starts with a --. So let's extract all the commit info from the original commits DataFrame.


In [5]:
commit_metadata = commits_raw[['sha', 'date', 'author']].fillna(method='ffill')
commit_metadata.head(5)


Out[5]:
sha date author
0 a0fc4d3828f Mon Aug 7 17:23:17 2017 +0200 peter
1 a0fc4d3828f Mon Aug 7 17:23:17 2017 +0200 peter
2 a0fc4d3828f Mon Aug 7 17:23:17 2017 +0200 peter
3 a0fc4d3828f Mon Aug 7 17:23:17 2017 +0200 peter
4 a0fc4d3828f Mon Aug 7 17:23:17 2017 +0200 peter

With this, we can focus on extracting the information of a commit info row. The next command could be looking a little frightening, but don't worry. We go through it step by step.


In [6]:
file_info = commits_raw['file_stats'].dropna().str.split("\t", expand=True)
file_info.columns = ['additions', "deletions", "filename"]
file_info.head()


Out[6]:
additions deletions filename
1 3 9 java/java-analysis-impl/src/com/intellij/codeI...
2 17 0 java/java-tests/testData/inspection/nullablePr...
3 18 0 java/java-tests/testData/inspection/nullablePr...
4 6 0 java/java-tests/testSrc/com/intellij/java/code...
6 3 3 java/java-psi-impl/src/com/intellij/psi/scope/...

In [7]:
file_info['additions'] = pd.to_numeric(file_info['additions'], errors='coerce')
file_info['deletions'] = pd.to_numeric(file_info['deletions'], errors='coerce')
file_info.dtypes


Out[7]:
additions    float64
deletions    float64
filename      object
dtype: object

In [8]:
commits = commit_metadata.join(file_info, how='right')
commits = commits.dropna()
commits.head()


Out[8]:
sha date author additions deletions filename
1 a0fc4d3828f Mon Aug 7 17:23:17 2017 +0200 peter 3.0 9.0 java/java-analysis-impl/src/com/intellij/codeI...
2 a0fc4d3828f Mon Aug 7 17:23:17 2017 +0200 peter 17.0 0.0 java/java-tests/testData/inspection/nullablePr...
3 a0fc4d3828f Mon Aug 7 17:23:17 2017 +0200 peter 18.0 0.0 java/java-tests/testData/inspection/nullablePr...
4 a0fc4d3828f Mon Aug 7 17:23:17 2017 +0200 peter 6.0 0.0 java/java-tests/testSrc/com/intellij/java/code...
6 dec0d165974 Mon Aug 7 18:22:46 2017 +0200 Anna.Kozlova 3.0 3.0 java/java-psi-impl/src/com/intellij/psi/scope/...

In [87]:
commits.groupby('author').sum()[['additions']].sort_values(by='additions', ascending=False)


Out[87]:
additions
author
Dmitry Jemerov 3415183.0
Maxim Shafirov 1317300.0
Alexey Kudravtsev 1091957.0
Dmitry Trofimov 836951.0
Ekaterina Tuzova 834714.0
Anna Kozlova 807367.0
Roman Shevchenko 761080.0
Dmitry Avdeev 648392.0
no_reply@jetbrains.com 563445.0
Mikhail Golubev 522789.0
Eugene Kudelevsky 485821.0
Bas Leijdekkers 471711.0
irengrig 457734.0
peter 415278.0
Vladimir Krivosheev 396594.0
Anton Makeev 389985.0
Kirill Likhodedov 385721.0
Aleksey Pivovarov 384356.0
Eugene Zhuravlev 362103.0
Elizaveta Shashkova 359524.0
Alexey Gopachenko 337642.0
nik 317163.0
Anna.Kozlova 316704.0
Andrey Vlasovskikh 306105.0
anna 301108.0
Sergey Ignatov 265608.0
Denis.Zhdanov 240963.0
Maxim.Mossienko 240159.0
Peter Gromov 234667.0
Konstantin Bulenkov 207978.0
... ...
spLeaner 2.0
Marcus Hüsgen 2.0
Evgeny Naumenko 2.0
Sergey Zhukov 2.0
Stephen Friedrich 2.0
Tobias Bieniek 2.0
Ben Mintz 2.0
Dmitry Zhuravlev 2.0
suman-ganta 1.0
ryenus 1.0
Adam Batkin 1.0
Matthew Gharrity 1.0
Lucas Smaira 1.0
Amaury 1.0
Leonya Khachaturov 1.0
Raluca Sauciuc 1.0
Roma Leventov 1.0
Daniel Espendiller 1.0
Igor Mikhailuk 1.0
Sergej Zagursky 1.0
Sergey Golovachev 1.0
dmitrii.kravchenko 1.0
Anton Lobov 1.0
auchri 1.0
Chetan.Mehrotra 1.0
Deepanshu Gupta 1.0
Matthias Vill 1.0
Ladislav Thon 0.0
Ryan Prichard 0.0
animaru 0.0

454 rows × 1 columns

OK, this part is ready, let's have a look at the file statistics!

We're done!

Complete code block

To much code to look through? Here is everything from above in a condensed format.

Just some milliseconds to run through, not bad!

Summary

In this notebook, I showed you how to read some non-perfect structured data via the non-character separator trick. I also showed you how to transform the rows that contain multiple kinds of data into one nicely structured DataFrame.

Now that we have the Git repository DataFrame, we can do some nice things with it e. g. visualizing the code churn of a project, but that's a story for another notebook! But to give you a short preview:


In [88]:
%matplotlib inline
timed_commits = commits.set_index(pd.DatetimeIndex(commits['date']))[['insertions', 'deletions']].resample('1D').sum()
(timed_commits['insertions'] - timed_commits['deletions']).cumsum().fillna(method='ffill').plot()


---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-88-aebe6ddd42b6> in <module>()
      1 get_ipython().magic('matplotlib inline')
----> 2 timed_commits = commits.set_index(pd.DatetimeIndex(commits['date']))[['insertions', 'deletions']].resample('1D').sum()
      3 (timed_commits['insertions'] - timed_commits['deletions']).cumsum().fillna(method='ffill').plot()

C:\dev\apps\Anaconda3\lib\site-packages\pandas\core\frame.py in __getitem__(self, key)
   2054         if isinstance(key, (Series, np.ndarray, Index, list)):
   2055             # either boolean or fancy integer index
-> 2056             return self._getitem_array(key)
   2057         elif isinstance(key, DataFrame):
   2058             return self._getitem_frame(key)

C:\dev\apps\Anaconda3\lib\site-packages\pandas\core\frame.py in _getitem_array(self, key)
   2098             return self.take(indexer, axis=0, convert=False)
   2099         else:
-> 2100             indexer = self.loc._convert_to_indexer(key, axis=1)
   2101             return self.take(indexer, axis=1, convert=True)
   2102 

C:\dev\apps\Anaconda3\lib\site-packages\pandas\core\indexing.py in _convert_to_indexer(self, obj, axis, is_setter)
   1229                 mask = check == -1
   1230                 if mask.any():
-> 1231                     raise KeyError('%s not in index' % objarr[mask])
   1232 
   1233                 return _values_from_object(indexer)

KeyError: "['insertions'] not in index"

In [ ]:
%matplotlib inline
commits['author'].value_counts().plot(kind='pie', figsize=(10,10))

Stay tuned!